#ifndef _INDEX_H_
#define _INDEX_H_

#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <sstream>
#include <vector>

#include "DocumentVector.h"
#include "Dictionary.h"
#include "HTMLDocument.h"
#include "stringtok.h"

// TODO: Maybe use "<<" and ">>" operators instead of load and dump

class Index 
{

private:

	Dictionary dictionary;
	std::vector<DocumentVector> documents;
	char* index_name;
	char* dict_name;
	char* docs_name;

	friend class ConsineSimilarity;
	class CosineSimilarity
	{

	private:
		friend class Index;
			
		Index& index;
		
		CosineSimilarity(Index& idx) : index(idx) {}
		
		// dot product calculated using tf-idf
		double dot(const DocumentVector &dv1, const DocumentVector &dv2)
		{
			double dot = 0.0;
			int max_index = dv1.get_max_word_index();
			int min_index = dv1.get_min_word_index();

			if(max_index > dv2.get_max_word_index())
				max_index = dv2.get_max_word_index();

			if(min_index < dv2.get_min_word_index())
				min_index = dv2.get_min_word_index();
			

			for(int i = min_index; i <= max_index; i++)
			{
				dot += dv1[i] * dv2[i];
			}
			
			// double inv_freq;
			// for(int i = min_index; i <= max_index; i++)
			// {
			// 	inv_freq = 1 / index.term_frequency(i);
			// 	dot += (dv1[i] * inv_freq) * (dv2[i] * inv_freq);
			// }
				
			
			// double tf1, tf2, idf;
			// for(int i = min_index; i <= max_index; i++)
			// {	
			// 	idf = log(index.documents.size() / index.docs_containing(i));
			// 	tf1 = dv1[i] / dv1.get_num_words();
			// 	tf2 = dv2[i] / dv2.get_num_words();
			// 	
			// 	dot += (tf1 * idf) * (tf2 * idf);
			// }

			return dot;
		}

		double length(const DocumentVector& dv)
		{
			int length = 0;

			for(int i = dv.get_min_word_index(); i <= dv.get_max_word_index(); i++)
				length += dv[i] * dv[i];

			return sqrt(length);
		}
		
	public:
		double similarity(const DocumentVector& dv1, const DocumentVector& dv2)
		{
			return dot(dv1, dv2) / (length(dv1) * length(dv2));
		}
	};

public:

	Index(char* index_name_) : index_name(index_name_) 
	{
 		dict_name = "index/index.dict";
		docs_name = "index/index.docs";
	} 

	const std::vector<DocumentVector>& get_documents() 
	{	
		return documents;
	}
	
	const Dictionary& get_dictionary() 
	{
		return dictionary;
	}
	
	int term_frequency(int index)
	{
		int frequency = 0;
		DocumentVector tmp;
		for(unsigned int i = 0; i < documents.size(); i++)
		{
			tmp = documents[i];
			frequency += tmp[index];
		}

		return frequency;
	}
	
	int docs_containing(int index)
	{
		int num_docs = 0;
		DocumentVector tmp;
		for(unsigned int i = 0; i < documents.size(); i++)
		{
			tmp = documents[i];
			
			if(tmp[index] != 0)
				num_docs++;
		}

		return num_docs;
	}
	
	
	double similarity(DocumentVector& dv1, DocumentVector& dv2)
	{
		CosineSimilarity sim(*this);
		return sim.similarity(dv1, dv2);
	}
	
	void add_document(HTMLDocument& doc) 
	{
		std::stringstream content;
		content << doc;

		std::vector<std::string> words;		
		stringtok (words, content.str());

		int word_index = 0;

		for (unsigned int i = 0; i < words.size(); ++i)
			word_index = dictionary.add_word(words[i]);
		
		DocumentVector dv(dictionary.size()+1, doc.get_filename());
		for (unsigned int i = 0; i < words.size(); ++i)
		{
			word_index = dictionary[words[i]];
			dv[word_index]++;
		}	

	 	documents.insert(documents.end(), dv);
	}

	void load()
	{
		// load dictionary
		std::ifstream fin(dict_name);
		fin >> dictionary;
		fin.close();


		// load documents
		int num_docs = 0;
		DocumentVector tmp_dv;
		fin.open(docs_name);
		fin >> num_docs;

		for(int i = 0; i < num_docs; i++) 
		{
			fin >> tmp_dv;
			documents.insert(documents.end(), tmp_dv);
		}
		fin.close();
	}

	void dump() 
	{
		// dump dictionary
		std::ofstream fout(dict_name);
		fout << dictionary;
		fout.close();

		// dump documents
		fout.open(docs_name);
		fout << documents.size() << std::endl;
		for(unsigned int i = 0; i < documents.size(); i++) 
			fout << documents[i] << std::endl;	
		fout.close();
	}
};

#endif /* _INDEX_H_ */
